MatMulFusion
矩阵乘法融合(可选偏置和激活),计算:\(C = \mathrm{act}(A \times B + \mathrm{bias})\),
其中 A 形状为 \(M\times K\),B 为 \(K\times N\),C 与可选的 bias 为 \(M\times N\)。
激活 act 支持:
0: 无激活(Identity);1: ReLU;2: ReLU6
- 输入:
A - 输入矩阵 A(行优先,连续存储)。大小 M×K。
B - 输入矩阵 B(行优先,连续存储)。大小 K×N。
bias - 偏置矩阵(可为 NULL)。当 bias_broadcast=1 时大小为 N,否则为 M×N。
params - 参数打包成数组(共7个元素):
params[0] (M) - 维度参数:A 与 C 的行数。
params[1] (N) - 维度参数:B 与 C 的列数。
params[2] (K) - 维度参数:A 的列数,同时也是 B 的行数。
params[3] (activation_type) - 激活类型,取值 {0,1,2}。
params[4] (A_transpose) - A矩阵是否转置,取值 {0,1}。
params[5] (B_transpose) - B矩阵是否转置,取值 {0,1}。
params[6] (bias_broadcast) - 偏置是否广播,取值 {0,1}。
core_mask(可选) - 核掩码(仅适用于共享存储版本)。
- 输出:
C - 输出矩阵(行优先,大小 M×N)。
- 支持平台:
FT78NE、MT7004
备注
FT78NE 支持int8, int16, int32, fp32, fp64, cplx64, cplx128
MT7004 支持fp16, fp32, int16, int32, cplx64
复数类型的激活逐分量应用于实部与虚部
请确保输入按行优先连续布局,且不发生类型范围溢出;int8/int16/int32 计算未做饱和裁剪
转置操作通过参数控制,无需预先转置矩阵
共享存储版本:
-
void i8_matmul_fusion_s(int8_t *A, int8_t *B, int8_t *C, int8_t *bias, long long *params, int core_mask)
-
void i16_matmul_fusion_s(int16_t *A, int16_t *B, int16_t *C, int16_t *bias, long long *params, int core_mask)
-
void i32_matmul_fusion_s(int *A, int *B, int *C, int *bias, long long *params, int core_mask)
-
void hp_matmul_fusion_s(half *A, half *B, half *C, half *bias, long long *params, int core_mask)
-
void fp_matmul_fusion_s(float *A, float *B, float *C, float *bias, long long *params, int core_mask)
-
void dp_matmul_fusion_s(double *A, double *B, double *C, double *bias, long long *params, int core_mask)
-
void c64_matmul_fusion_s(float *A, float *B, float *C, float *bias, long long *params, int core_mask)
-
void c128_matmul_fusion_s(double *A, double *B, double *C, double *bias, long long *params, int core_mask)
C调用示例:
1#include <stdio.h> 2#include <stdbool.h> 3 4int main(int argc, char* argv[]) { 5 float* A_ref = (float*)0x90000000; 6 float* B_ref = (float*)0x91000000; 7 float* C_ref = (float*)0x92000000; 8 float* bias_ref = (float*)0x93000000; 9 10 float* C_multi = (float*)0x95000000; 11 int core_mask = 0b1111; // 使用4核 12 13 int M = 64; 14 int N = 64; 15 int K = 64; 16 17 bool bias_broadcast = true; 18 bool A_transpose = false; 19 bool B_transpose = true; 20 21 // Initialize test data (core 0 only) 22 if (coreid == 0) { 23 24 // Initialize A, B, bias with small values 25 for (int i = 0; i < M * K; ++i) { 26 A_ref[i] = (float)(i % 10) * 0.1f; 27 } 28 for (int i = 0; i < K * N; ++i) { 29 B_ref[i] = (float)(i % 10) * 0.1f; 30 } 31 for (int i = 0; i < M * N; ++i) { 32 C_ref[i] = 0.0f; 33 C_multi[i] = 0.0f; 34 bias_ref[i] = (float)(i % 5) * 0.01f; 35 } 36 } 37 long long params[7]; 38 params[0] = (long long)M; 39 params[1] = (long long)N; 40 params[2] = (long long)K; 41 params[3] = (long long)ACTIVATION_RELU; 42 params[4] = (long long)A_transpose; 43 params[5] = (long long)B_transpose; 44 params[6] = (long long)bias_broadcast; 45 46 fp_matmul_fusion_s(A_ref, B_ref, C_multi, bias_ref, params, core_mask); 47 return 0; 48}
私有存储版本:
-
void i8_matmul_fusion_p(int8_t *A, int8_t *B, int8_t *C, int8_t *bias, long long *params)
-
void i16_matmul_fusion_p(int16_t *A, int16_t *B, int16_t *C, int16_t *bias, long long *params)
-
void i32_matmul_fusion_p(int *A, int *B, int *C, int *bias, long long *params)
-
void hp_matmul_fusion_p(half *A, half *B, half *C, half *bias, long long *params)
-
void fp_matmul_fusion_p(float *A, float *B, float *C, float *bias, long long *params)
-
void dp_matmul_fusion_p(double *A, double *B, double *C, double *bias, long long *params)
-
void c64_matmul_fusion_p(float *A, float *B, float *C, float *bias, long long *params)
-
void c128_matmul_fusion_p(double *A, double *B, double *C, double *bias, long long *params)
C调用示例:
#include <stdio.h>
#include <stdbool.h>

/* Activation code passed in params[3]: 0 = identity, 1 = ReLU, 2 = ReLU6. */
#ifndef ACTIVATION_RELU
#define ACTIVATION_RELU 1
#endif

/* Private-storage fp32 variant of the fused matmul routine. */
void fp_matmul_fusion_p(float *A, float *B, float *C, float *bias,
                        long long *params);

/*
 * Private-storage usage example: C = ReLU(A x B + bias) on fp32 data,
 * with A flagged as transposed and the bias broadcast (length N).
 */
int main(int argc, char* argv[]) {
    /* Fixed buffer addresses used by the example. */
    float *A_ref    = (float *)0x10010000;
    float *B_ref    = (float *)0x10020000;
    float *C_ref    = (float *)0x10030000;   /* zeroed below, not passed to the call */
    float *bias_ref = (float *)0x10040000;

    float *C_single = (float *)0x10050000;   /* output of the single-core call */

    int M = 8;
    int N = 8;
    int K = 8;

    bool bias_broadcast = true;
    bool A_transpose = true;
    bool B_transpose = false;

    /* A broadcast bias holds N elements; otherwise it is a full M x N matrix. */
    const int bias_len = bias_broadcast ? N : M * N;

    /* Fill A, B and bias with small values. */
    for (int i = 0; i < M * K; ++i) {
        A_ref[i] = (float)(i % 10) * 0.1f;
    }
    for (int i = 0; i < K * N; ++i) {
        B_ref[i] = (float)(i % 10) * 0.1f;
    }
    for (int i = 0; i < M * N; ++i) {
        C_ref[i] = 0.0f;
        C_single[i] = 0.0f;
    }
    for (int i = 0; i < bias_len; ++i) {
        bias_ref[i] = (float)(i % 5) * 0.01f;
    }

    /* Parameter pack, in the order the routine expects. */
    long long params[7];
    params[0] = (long long)M;                /* M */
    params[1] = (long long)N;                /* N */
    params[2] = (long long)K;                /* K */
    params[3] = (long long)ACTIVATION_RELU;  /* activation_type */
    params[4] = (long long)A_transpose;      /* A_transpose */
    params[5] = (long long)B_transpose;      /* B_transpose */
    params[6] = (long long)bias_broadcast;   /* bias_broadcast */

    fp_matmul_fusion_p(A_ref, B_ref, C_single, bias_ref, params);
    return 0;
}